# running libraries to use
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(arcos)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(ggrepel)
library(tidycensus)
library(dplyr)
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library(mapview)
library(lsr)
library(corrr)
##
## Attaching package: 'corrr'
## The following object is masked from 'package:lsr':
##
## correlate
library(stringr)
# In our last in-class lab, we wrote code to rank zip codes based on the median income in a county and how many pills per person the county received from 2006-2012. At the end of the lab, I ran the data for all of the counties in the country. Right now, I am going to run the data for the individual towns within Anne Arundel County. I've looked at the rankings before, but now I will visualize them.
# This is from lab_O7. I requested my own key from the census:
key <- "uO4EK6I"
census_api_key("366af81ca42273ae67ad0729766f54f041bd300d")
## To install your API key for use in future sessions, run this function with `install = TRUE`.
# The following graf is from lab_07:
# Store a table with the median household income value for each county using the get_acs() function. ACS is short for American Community Survey, one of two main census products. In the table that's loaded in, the :estimate" is the median household income for that county, averaged over 5 years ending in 2006.
county_median_household_income <- get_acs(geography = "county", variables = c("B19013_001"), survey="acs5", year = 2012)
## Getting data from the 2008-2012 5-year ACS
# How did I get the variables? I pulled in a table from the tidycensus package that lists all of the thousands of variables available through the census. Load the table below and view it. Use filters in the R Viewer window to find things you might want to use later. You can also find table and variable numbers at https://data.census.gov/cedsci/.
acs_variables <- load_variables(2012, "acs5", cache = TRUE)
# Filter just for Maryland counties
md_county_median_household_income <- county_median_household_income %>%
filter(str_detect(NAME, ", Maryland"))
# In the state of Maryland, where does Anne Arundel County rank in terms of pills received?
arcos_county_pills_per_year <- summarized_county_annual(key = key) %>%
clean_names()
maryland_pills_2012 <- arcos_county_pills_per_year %>%
filter(buyer_state == "MD", year=="2012") %>%
select(buyer_county, year, dosage_unit, countyfips)
maryland_population_2012 <- county_population(key = key) %>%
clean_names() %>%
filter(buyer_state == "MD", year=="2012") %>%
select(county_name, population, countyfips)
maryland_2012 <- maryland_pills_2012 %>%
inner_join(maryland_population_2012, by=("countyfips")) %>%
select(buyer_county, dosage_unit, population, countyfips)
md_2012_pills_per_person <- maryland_2012 %>%
select(buyer_county, dosage_unit, population, countyfips) %>%
mutate(pills_per_person = (dosage_unit / population))
md_county_median_household_income <- md_county_median_household_income %>%
mutate(countyfips = GEOID)
md_2012_household_pills <- md_county_median_household_income %>%
inner_join(md_2012_pills_per_person, by=("countyfips")) %>%
arrange(desc(dosage_unit))
# (1) From this, we learn that Anne Arundel County ranks third in the most pills received in the state of Maryland.
# In the state of Maryland, where does Anne Arundel County fall in terms of median household income?
md_2012_household_pills %>%
arrange(desc(estimate))
## # A tibble: 24 x 10
## GEOID NAME variable estimate moe countyfips buyer_county dosage_unit
## <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 24027 Howa… B19013_… 107821 1590 24027 HOWARD 8187185
## 2 24031 Mont… B19013_… 96985 1290 24031 MONTGOMERY 16492456
## 3 24017 Char… B19013_… 93063 1923 24017 CHARLES 5426630
## 4 24009 Calv… B19013_… 92395 3557 24009 CALVERT 4377640
## 5 24003 Anne… B19013_… 86987 1063 24003 ANNE ARUNDEL 19928983
## 6 24035 Quee… B19013_… 86013 3049 24035 QUEEN ANNES 1551490
## 7 24037 St. … B19013_… 85032 1995 24037 SAINT MARYS 3959000
## 8 24021 Fred… B19013_… 83706 1238 24021 FREDERICK 8439302
## 9 24013 Carr… B19013_… 83155 1624 24013 CARROLL 5818440
## 10 24025 Harf… B19013_… 80441 1249 24025 HARFORD 11416250
## # … with 14 more rows, and 2 more variables: population <int>,
## # pills_per_person <dbl>
# (2) Anne Arundel County is the fifth-richest county in Maryland. Baltimore County, which received the most pills, ranks 12th. The poorest county in Maryland, Allegany County, ranks 12th for the most number of pills received. In Maryland, there isn't a direct correlation between median household income and number of pills received. I want to find out whether or not there is a national trend.
# Over the 2006-2012 period that the ARCOS database tracks, where does Anne Arundel County fall nationally in terms of how many pills it received each year?
country_pills_2012 <- arcos_county_pills_per_year %>%
filter(year=="2006") %>%
select(buyer_county, buyer_state, year, dosage_unit, countyfips) %>%
arrange(desc(dosage_unit))
# Where Anne Arundel County ranks in the country by year:
# 2006: 108, with 14376402 pills
# 2007: 112, with 15836968 pills
# 2008: 124, with 16671976 pills
# 2009: 117, with 18239025 pills
# 2010: 124, with 19151845 pills
# 2011: 126, with 20354650 pills
# 2012: 125, with 19928983 pills
# (3) Anne Arundel County ranked the highest in 2006, at 108 with a total 14,376,402 pills received.
# Where does Anne Arundel County fall nationally in terms of median household income?
country_income_05_09 <- get_acs(geography = "county", variables = c("B19013_001"), year = 2009) %>%
arrange(desc(estimate))
## Getting data from the 2005-2009 5-year ACS
# (4) From 2005-2009, Anne Arundel County ranked 34th in median household income in the country. This dataset has a large margin of error.
# Where does Anne Arundel County fall nationally in terms of median household income?
country_income_08_12 <- get_acs(geography = "county", variables = c("B19013_001"), year = 2012) %>%
arrange(desc(estimate))
## Getting data from the 2008-2012 5-year ACS
# (5) From 2008-2012, Anne Arundel County moved up in rank to 27th in median household income in the country.
# How does dosage_unit relate to median household income?
country_income_08_12 <- country_income_08_12 %>%
mutate(countyfips = GEOID)
country_income_pills <- country_income_08_12 %>%
inner_join(country_pills_2012, by=("countyfips")) %>%
arrange(desc(dosage_unit))
ggplot(country_income_pills) +
geom_point(aes(dosage_unit, estimate)) +
labs(title="Dosage Unit by Median Income", caption = "Source: DEA ARCOS database, via Washington Post", fill="buyer_county") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(dosage_unit, estimate), method = "lm", se = FALSE)

# (6) From this, I can see that the county with the highest dosage unit was under the country's average for median household income.
v17 <- load_variables(2017, "acs5", cache = TRUE)
x <- v17 %>%
filter(str_detect(concept, "EMPLOY"))
# This should tell me the employment status of the labor force in Anne Arundel County in 2012. USE VARIABLE B23025_002
aac_labor_force <- get_acs(geography = "county", variables = c("B23025_002"), year = 2012)
## Getting data from the 2008-2012 5-year ACS
# (7) From the 2008-2012 census data, there were 303,444 people in the workforce.
# This is the breakdown of black people in Anne Arundel County. USE VARIABLE B02001_003
aac_black <- get_acs(geography = "county", variables = c("B02001_003"), year = 2012)
## Getting data from the 2008-2012 5-year ACS
# (8) In Anne Arundel County in 2012, there were 82,542 black people.
# This is the breakdown of asian people in Anne Arundel County. USE VARIABLE B02001_005
aac_asian <- get_acs(geography = "county", variables = c("B02001_005"), year = 2012)
## Getting data from the 2008-2012 5-year ACS
# (9) In 2012, there were 18,618 asian people in Anne Arundel County.
# This will give a breakdown of the white population in Anne Arundel County. USE VARIABLE B02001_002
aac_white <- get_acs(geography = "county", variables = c("B02001_002"), year = 2012)
## Getting data from the 2008-2012 5-year ACS
# (10) In Anne Arundel County in 2012, there were 408,521 white people.
# What was the population of Anne Arundel County in 2012?
aac_population_2012 <- county_population(state = "MD", county = "Anne Arundel", key = key)
# (11) In 2012, Anne Arundel County had a population of 538,473 people. Based on my three previous code blocks, Anne Arundel County is largely white.
# Does the variable for 'all races' work the same as how I previously found the total population?
aac_race <- get_acs(geography = "county", variables = c("B02001_001"), year = 2012)
## Getting data from the 2008-2012 5-year ACS
# (12) This gives the same number as the county_population I did in the previous code block.
# My research has said the opioid crisis largely impacted white people. So how does pills per person compare to the non-white population?
race_visual <- aac_asian %>%
inner_join(aac_black, by=("GEOID")) %>%
inner_join(aac_white, by=("GEOID")) %>%
select(GEOID, NAME.x, estimate.x, estimate.y, estimate)
race_visual %>%
rename("black" = "estimate.y") %>%
rename("white" = "estimate") %>%
rename("asian" = "estimate.x")
## # A tibble: 3,221 x 5
## GEOID NAME.x asian black white
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 01001 Autauga County, Alabama 439 9880 43084
## 2 01003 Baldwin County, Alabama 1347 17016 158259
## 3 01005 Barbour County, Alabama 212 12645 13448
## 4 01007 Bibb County, Alabama 25 4953 17459
## 5 01009 Blount County, Alabama 97 754 54507
## 6 01011 Bullock County, Alabama 0 7609 3028
## 7 01013 Butler County, Alabama 183 8961 11380
## 8 01015 Calhoun County, Alabama 827 24421 88405
## 9 01017 Chambers County, Alabama 178 13409 20171
## 10 01019 Cherokee County, Alabama 78 1253 24074
## # … with 3,211 more rows
# It took me a very long time to make this work, and it isn't working in View, only here.
race <- get_acs(geography = "county", variables = c("B02001_001"), year = 2012)
## Getting data from the 2008-2012 5-year ACS
poverty_rate <- get_acs(geography = "county", variables = c("B06012_002"), year = 2012)
## Getting data from the 2008-2012 5-year ACS
# What is the poverty rate per county?
pop_poverty <- race %>%
inner_join(poverty_rate, by=c("GEOID")) %>%
select(GEOID, NAME.x, estimate.x, estimate.y) %>%
mutate(poverty_rate = estimate.y/estimate.x *100)
# I tried several times to rename the columns: estimate.x to "population" and estimate.y to "poverty. Neither the rename nor mutate functions worked. I tried colnames, as well. It kept telling me the object wasn't found, those column types weren't supported or spouting random error messages. I'm not sure my dplyr is properly installed.
# (13) With this, I joined the population and poverty tables and created a new column to calculate the poverty rate in every county in the country (except Puerto Rico, which is not included in this dataset).
nonwhite_population <- aac_asian %>%
inner_join(aac_black, by="GEOID") %>%
select(GEOID, NAME.x, estimate.x, estimate.y) %>%
mutate(nonwhite_total = estimate.x + estimate.y) %>%
select(GEOID, NAME.x, nonwhite_total)
# I need to find the total population of men in the workforce, then the total number of women in the workforce. Then I will add those together and divide it by the total population, and this will give me a (very) rough idea of what the unemployment rate is.
# The variable I'm using is unemployment based on health care enrollment. I want to see if this works. B27011_008
unemployment_by_healthcare <- get_acs(geography = "county", variables = c("B27011_008"), year = 2012)
## Getting data from the 2008-2012 5-year ACS
total_unemplyment <- unemployment_by_healthcare %>%
inner_join(race, by="GEOID") %>%
select(GEOID, NAME.x, estimate.x, estimate.y) %>%
mutate(unemployment_rate = estimate.x/estimate.y *100)
# (17) These numbers aren't as outrageous as I thought they would be, so I could see this being fairly accurate.
# What is the percentage of white residents in each county?
white_pop <- aac_white %>%
inner_join(race, by="GEOID") %>%
select(GEOID, NAME.x, estimate.x, estimate.y) %>%
mutate(white_percentage = estimate.x/estimate.y *100)
# (15) With this, I found the number of people who identify as white and then compared that to the county's total population to find the percentage of white residents in each county. Much of the research says the opioid crisis largely impacted the white population, and I want to see if that's true. The next step will be to compare this to the dosage unit.
# How do the white population and poverty rate compare?
white_poverty <- pop_poverty %>%
inner_join(white_pop, by="GEOID") %>%
select(GEOID, NAME.x.x, poverty_rate, white_percentage)
pills <- summarized_county_annual(key = key)
# I want to combine all of the variables I've found into one table so I can start comparing them and see if anything stands out.
everything <- total_unemplyment %>%
inner_join(white_poverty, by="GEOID") %>%
select(GEOID, NAME.x, unemployment_rate, poverty_rate, white_percentage)
# (18) So far, I don't see too many shocking trends. The places with the highest poverty rate generally have a lower white population. Poverty and unemployment are generally similar. I need to add in dosage unit to get the key piece of my analysis.
# This now includes pills and pills per person, in addition to the other variables I found, to see if anything sticks out.
pills_population <- pills %>%
inner_join(race, by=c("countyfips" = "GEOID")) %>%
group_by(BUYER_COUNTY, BUYER_STATE, countyfips) %>%
summarise(total_pills = sum(DOSAGE_UNIT), total_population = sum(estimate)) %>%
mutate(pills_per_person = total_pills/total_population) %>%
inner_join(everything, by=c("countyfips" = "GEOID"))
# (19) This features every county/countyfips in the country, and includes the following information for each for 2012: total pills, total population, pills per person, unemployment rate, poverty rate, and percentage of white population. Using this, I should be able to see what/if any of these fields are correlated.
county_geodata_shifted <- get_acs(geography = "county",
variables = "B01001_001", geometry = TRUE, shift_geo = TRUE)
## Getting data from the 2013-2017 5-year ACS
## Using feature geometry obtained from the albersusa package
## Please note: Alaska and Hawaii are being shifted and are not to scale.
pills_population_map <- county_geodata_shifted %>%
inner_join(pills_population, by=c("GEOID" = "countyfips"))
nonwhite_pills <- nonwhite_population %>%
inner_join(pills_population_map, by="GEOID") %>%
select(GEOID, NAME, nonwhite_total, pills_per_person)
ggplot(nonwhite_pills) +
geom_point(aes(nonwhite_total, pills_per_person)) +
labs(title="Pills Per Person in Non-White Population", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(nonwhite_total, pills_per_person), method = "lm", se = FALSE)

# (14) From this, we can see that, overall, the smaller the number of non-white people in a county, the smaller the number of pills per person.
aac_all_races <- aac_white %>%
inner_join(aac_black, by=("GEOID"))
ggplot(white_poverty) +
geom_point(aes(poverty_rate, white_percentage)) +
labs(title="Poverty Rate Compared to White Population", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(poverty_rate, white_percentage), method = "lm", se = FALSE)
## Warning: Removed 78 rows containing non-finite values (stat_smooth).
## Warning: Removed 78 rows containing missing values (geom_point).

# (16) This isn't perfect, but from this graph, we can see that the areas with a higher white percentage have a much lower poverty rate. Now I need to add in dosage unit and unemployment.
# How would I effectively visualize the pills per person in each county?
mapview(pills_population_map, zcol = "pills_per_person", legend = TRUE)
# (20) This is a visual representation of how many pills per person was in each county.
# Do total pills directly correlate to the total popluation? As in, the higher the population, the higher the total pills?
ggplot(pills_population) +
geom_point(aes(total_pills, total_population)) +
labs(title="Total Population and Total Pills", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(total_pills, total_population), method = "lm", se = FALSE)

# (21) This isn't too telling, but it does show some outliers in how many pills some counties received in relation to its total population.
# What is an easy way to show the relationship between many variables all at once?
pills_population <- as.data.frame(pills_population)
pills_population %>%
select(-BUYER_COUNTY, -BUYER_STATE, -countyfips, -NAME.x) %>%
correlate()
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 6 x 7
## rowname total_pills total_population pills_per_person unemployment_ra…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 total_… NA 0.905 0.109 0.0154
## 2 total_… 0.905 NA -0.0328 0.0188
## 3 pills_… 0.109 -0.0328 NA 0.0116
## 4 unempl… 0.0154 0.0188 0.0116 NA
## 5 povert… -0.0461 -0.0621 0.263 0.335
## 6 white_… -0.178 -0.195 0.0764 -0.202
## # … with 2 more variables: poverty_rate <dbl>, white_percentage <dbl>
# (22) The correlate function allows me to quickly see the relationships between each of these factors without creating a ton of scatter plots. There isn't too high of a correlation between anything I'm looking at, but the highest correlations are between total pills and white percentage, and poverty rate and pills per person.
# Create a visualization that shows the correlation between total pills and the white population.
ggplot(pills_population) +
geom_point(aes(total_pills, white_percentage)) +
labs(title="How Total Pills Correlates with White Population", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(total_pills, white_percentage), method = "lm", se = FALSE)

# (23) I wanted to visualize the correlation between total_pills and white_percentage from what I found in the previous block, as this was one of the two stronger correlations. It shows that the higher the percentage of the white population, the higher the total pills.
# What is the relationship between poverty rate and pills per person?
ggplot(pills_population) +
geom_point(aes(poverty_rate, pills_per_person)) +
labs(title="Pills Per Person and Poverty Rate", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(poverty_rate, pills_per_person), method = "lm", se = FALSE)

# (24) This scatter plot is the most interesting to look at, as the points are more spread out. There are a few outliers, but you can see that the highest density of of pills per person is when the poverty rate is lower. This is relatively in line with what I found when researching the opioid crisis, which said it largely impacted the white and poor populations.
# Now that I've established a few national trends, I want to go back and see how Anne Arundel County fits in. In the beginning, I found that Anne Arundel County is largely white, so how does the county's total pills correlate with the white population?
pills_county <- everything %>%
inner_join(pills, by=c("GEOID" = "countyfips")) %>%
group_by(NAME.x, BUYER_COUNTY, BUYER_STATE, GEOID) %>%
summarise(total_pills = sum(DOSAGE_UNIT))
everything_county <- everything %>%
inner_join(pills_county, by="GEOID") %>%
select(GEOID, NAME.x.x, unemployment_rate, poverty_rate, white_percentage, total_pills)
# everything <- everything %>%
# inner_join(pills_county, by="GEOID") %>%
# select(GEOID, NAME.x, unemployment_rate, poverty_rate, white_percentage)
everything_county %>%
filter(str_detect(NAME.x.x, "Maryland")) %>%
select(-GEOID, -NAME.x.x) %>%
correlate()
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 4 x 5
## rowname unemployment_ra… poverty_rate white_percentage total_pills
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 unemployment_… NA 0.545 -0.378 -0.0178
## 2 poverty_rate 0.545 NA -0.273 -0.0486
## 3 white_percent… -0.378 -0.273 NA -0.474
## 4 total_pills -0.0178 -0.0486 -0.474 NA
# (25) Total pills does not at all correlate with unemployment or poverty, meaning this was more of a wealthy issue. It most strongly correlates with the white percentage, but not too shocking.
# What is the relationship between total pills and white population?
ggplot(everything_county) +
geom_point(aes(white_percentage, total_pills)) +
labs(title="Total Pills and White Popluation", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(white_percentage, total_pills), method = "lm", se = FALSE)

# (26) This is a visual of total pills and white population in each county. There are many outliers, but generally, the higher the percentage of the white population, the higher the total pills sent to a county.
# At some point, median household income got lost and disappeared from 'everything,' so now I'm going to add it back in and see if that offers any further information on how these factors relate to each other.
everything_income <- everything_county %>%
inner_join(county_median_household_income, by="GEOID") %>%
select(GEOID, NAME.x.x, unemployment_rate, white_percentage, poverty_rate, total_pills, estimate)
everything_income %>%
filter(str_detect(NAME.x.x, "Maryland")) %>%
select(-GEOID, -NAME.x.x) %>%
correlate()
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 5 x 6
## rowname unemployment_ra… white_percentage poverty_rate total_pills
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 unempl… NA -0.378 0.545 -0.0178
## 2 white_… -0.378 NA -0.273 -0.474
## 3 povert… 0.545 -0.273 NA -0.0486
## 4 total_… -0.0178 -0.474 -0.0486 NA
## 5 estima… -0.566 -0.0166 -0.871 0.172
## # … with 1 more variable: estimate <dbl>
# (27) It seems here that the highest correlation is between median household income and total pills. I'm going to create a visual to see if it's any more clear.
# What is the relationship between median household income and total pills?
ggplot(everything_income) +
geom_point(aes(estimate, total_pills)) +
labs(title="Total Pills and Median Household Income", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(estimate, total_pills), method = "lm", se = FALSE)

# (28) This shows the opposite of what I saw before. It seems counties with a median household income of ~$50,000 had the most total pills. To see how much weight this holds, I would need to know more about the demographics for those specific counties and how many counties in the country fall into this income range.
# What is the breakdown of pills per person and median household income?
pills_median <- everything_income %>%
inner_join(pills_population, by=c("NAME.x.x" = "NAME.x")) %>%
select(GEOID, NAME.x.x, pills_per_person, total_pills.x, estimate)
ggplot(pills_median) +
geom_point(aes(estimate, pills_per_person)) +
labs(title="Pills Per Person and Median Household Income", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(estimate, pills_per_person), method = "lm", se = FALSE)

# (29) This shows that the lower the median household income, the larger the pills per person.
# Now I want to see how the total pills correlates with each race.
pills_race <- everything_income %>%
inner_join(aac_asian, by="GEOID") %>%
inner_join(aac_black, by="GEOID") %>%
inner_join(aac_race, by="GEOID") %>%
rename("black" = "estimate.x.x") %>%
rename("white" = "estimate.y.y") %>%
rename("asian" = "estimate.y")%>%
select(GEOID, NAME.x.x, black, white, asian, total_pills)
# What is the relationship between the three race variables and total pills?
pills_race %>%
filter(str_detect(NAME.x.x, "Maryland")) %>%
select(-GEOID, -NAME.x.x) %>%
correlate()
##
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
## # A tibble: 4 x 5
## rowname black white asian total_pills
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 black NA 0.805 0.420 0.667
## 2 white 0.805 NA 0.796 0.877
## 3 asian 0.420 0.796 NA 0.530
## 4 total_pills 0.667 0.877 0.530 NA
# (30) Really pleased that I was able to do this and rename all the columns. These numbers are actually a lot higher and closer than I thought they'd be, but it definitely looks like the highest correlation is between the white population and total pills.
# What is the relationship between the black population and total pills?
ggplot(pills_race) +
geom_point(aes(black, total_pills)) +
labs(title="Total Pills and Black Population", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(black, total_pills), method = "lm", se = FALSE)

# (31) When visualized, this makes the correlation between the black population and total pills look smaller.
# What is the relationships between the white population adn total pills?
ggplot(pills_race) +
geom_point(aes(white, total_pills)) +
labs(title="Total Pills and White Population", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(white, total_pills), method = "lm", se = FALSE)

# (32) This is more of a narrow and upward trend than the black population and has much fewer outliers.
# What is the relationship between the asian population and total pills?
ggplot(pills_race) +
geom_point(aes(asian, total_pills)) +
labs(title="Total Pills and Asian Population", caption = "Source: DEA ARCOS database, via Washington Post", fill="") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
geom_smooth(aes(asian, total_pills), method = "lm", se = FALSE)

# (33) Fewer outliers than with the black population, but not as much of a direct or narrow trend as with the white population.